import os
import math
import numpy as np
from ctypes import CDLL, POINTER, c_int32, c_int16, c_bool
from IPython.display import display, Audio, Markdown
from scipy.signal import spectrogram
import matplotlib.pyplot as plt
from pydub import AudioSegment
# Init C interface
# NOTE(review): os.path.abspath('') is the current working directory, so
# dirname() yields its parent — this assumes the notebook/script runs one
# directory below the repo root where build/libdsp.so lives; confirm.
current_dir = os.path.dirname(os.path.realpath(os.path.abspath('')))
so_file = os.path.normpath(current_dir + '/build/libdsp.so')
c_interface = CDLL(so_file)
# init_audio_accumulator(input_frames, output_frames, processing_frames,
# osamp, sample_rate) -> void. Argument order here must match the call
# site below and the C declaration.
c_interface.init_audio_accumulator.argtypes = (
c_int32,
c_int32,
c_int32,
c_int32,
c_int32,
)
c_interface.init_audio_accumulator.restype = None
# accumulate_input(int16_t*) -> void: takes one window of interleaved
# samples; the ndpointer enforces a contiguous 1-D int16 numpy array.
c_interface.accumulate_input.argtypes = [
np.ctypeslib.ndpointer(dtype=np.int16, ndim=1, flags='CONTIGUOUS')
]
c_interface.accumulate_input.restype = None
# extract_output() -> int16_t*: pointer into a C-owned sample buffer.
c_interface.extract_output.argtypes = []
c_interface.extract_output.restype = POINTER(c_int16)
# is_output_ready() -> bool: polled after each accumulate_input call.
c_interface.is_output_ready.argtypes = []
c_interface.is_output_ready.restype = c_bool
# Init audio accumulator
# A "frame" is one sample per channel; buffer sizes in samples are
# frames * NUM_CHANNELS (see SAMPLES_PER_SECOND below).
NUM_INPUT_FRAMES = 1024
NUM_PROCESSING_FRAMES = (NUM_INPUT_FRAMES * 2)
NUM_OUTPUT_FRAMES = 2048
# NOTE(review): NUM_DELAY_FRAMES is never passed to init_audio_accumulator
# below — confirm whether it is still needed or is dead configuration.
NUM_DELAY_FRAMES = 2048
SAMPLE_RATE = 22050
OSAMP = 2  # presumably the STFT overlap factor used by the DSP — TODO confirm
NUM_CHANNELS = 2
# Interleaved stereo: samples per second of wall-clock audio.
SAMPLES_PER_SECOND = SAMPLE_RATE * NUM_CHANNELS
# Positional order must match the argtypes declaration above:
# (input, output, processing, osamp, sample_rate).
c_interface.init_audio_accumulator(
NUM_INPUT_FRAMES,
NUM_OUTPUT_FRAMES,
NUM_PROCESSING_FRAMES,
OSAMP,
SAMPLE_RATE
)
# Display audio functions
def display_spectrogram(audio):
    """Render a log-power spectrogram of a 1-D audio signal at SAMPLE_RATE."""
    freqs, times, power = spectrogram(
        audio, fs=SAMPLE_RATE, window='hann', nperseg=1024
    )
    # Tiny epsilon keeps log10 away from zero-power bins.
    power_db = 10 * np.log10(power + 1e-10)
    plt.pcolormesh(times, freqs, power_db, shading='gouraud')
    plt.ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.colorbar(label='dB')
    plt.show()
def display_audio(name, audio_l, audio_r):
    """Show a titled stereo audio player, then a spectrogram of the mono mix."""
    header = Markdown('### ' + name)
    player = Audio(data=(audio_l, audio_r), rate=SAMPLE_RATE)
    display(header, player)
    # The spectrogram helper takes a single channel: average L and R in float.
    mono_mix = (audio_l.astype(np.float32) + audio_r.astype(np.float32)) / 2
    display_spectrogram(mono_mix)
# Audio processing function
def process_audio(audio_file_name):
# Load audio (and interleave it like a real world audio signal)
file_path = "audio/" + audio_file_name + ".mp3"
audio = AudioSegment.from_mp3(file_path).set_frame_rate(SAMPLE_RATE)
input_audio = np.array(audio.get_array_of_samples())
start_sample = SAMPLES_PER_SECOND * 30
end_sample = SAMPLES_PER_SECOND * 60
input_audio = input_audio[start_sample:end_sample]
# Process each window of audio
output_audio = np.empty((0,), dtype=np.int16)
num_channels = 2
samples_per_input_buffer = NUM_INPUT_FRAMES * num_channels
samples_per_output_buffer = NUM_OUTPUT_FRAMES * num_channels
n_windows = math.floor(len(input_audio) / samples_per_input_buffer)
for i in range(n_windows):
start = i * samples_per_input_buffer
end = start + samples_per_input_buffer
window = input_audio[start:end]
c_interface.accumulate_input(window)
is_output_ready = c_interface.is_output_ready()
if is_output_ready:
output_audio_buffer = c_interface.extract_output()
output_audio_nparray = np.ctypeslib.as_array(output_audio_buffer, shape=(samples_per_output_buffer,))
output_audio = np.concatenate([output_audio, output_audio_nparray])
# Deinterleave audio
input_l = input_audio[::2]
input_r = input_audio[1::2]
output_l = output_audio[::2]
output_r = output_audio[1::2]
# Display output
input_mono = (input_l.astype(np.float32) + input_r.astype(np.float32)) / 2
display_audio("Input (stereo)", input_mono, input_mono)
display_audio("Output (left)", output_l, output_l)
display_audio("Output (right)", output_r, output_r)
display_audio("Output (stereo)", output_l, output_r)
# Process several audio tracks
audio_file_names = [
    'Bad Bunny - NUEVAYoL',
    'Miley Cyrus - We Can\'t Stop',
    'SZA - BMF',
]
for track in audio_file_names:
    # Section heading per track, then run the full DSP/display pipeline.
    display(Markdown('## ' + track))
    process_audio(track)